clear 
capture log close
log using institution_characteristics.txt, text replace

/**************************************
Author: Kevin Stange
Date: 06/25/2021
Update: 6/25/2021
Update: 5/26/2022
Update: 7/20/2022
Update: 1/4/2024

Important note: need to set cd to file location: 
(e.g. .\Github\footestange_biaspaper\analysis_progs)

This program assembles institution characteristics for our sample, including
computing differences between our "Top" and "Non Top" schools
The institution sample comes from College Scorecard 2006-7 File. 
*************************************/

#delimit ; 
global graphdir "~\Documents\Github\footestange_biaspaper\paper\figures" ; 
global datadir "~\Documents\Github\footestange_biaspaper\disclosed_output";
global ancillarydir "~\Documents\Github\footestange_biaspaper\ancillary_data" ;
set more off ; 
set scheme s1mono ;




/****************************
Read in and save college scorecard data
*****************************/
import delimited "$ancillarydir\MERGED2006_07_PP.csv", clear ;
keep unitid opeid opeid6 instnm control st_fips preddeg main adm_rate adm_rate_all  sat_avg sat_avg_all inexpfte tuitfte tuitionfee_in pctpell satmt25 satmt75 ugds c150_4 ;


foreach var of varlist opeid control adm_rate adm_rate_all  sat_avg sat_avg_all inexpfte tuitfte tuitionfee_in pctpell satmt25 satmt75 ugds c150_4 {;
gen `var'n = real(`var');
drop `var';
rename `var'n `var';
};

keep if control == 1 & preddeg==3 ;

label def statel 8 "CO" 36 "NY" 39 "OH" 42 "PA" 48 "TX" 13 "GA" 29 "MO" 27 "MN" 40 "OK" 49 "UT" 51 "VA";
label val st_fips statel ;

keep if inlist(st_fips, 8, 36, 39, 42, 48, 13, 27, 29, 40, 49, 51);

tab st_fips, m ;


gen sat_math = (satmt25+satmt75)/2 ;

 * Drop Air Force and West POint ;
 drop if unitid == 128328  | unitid == 197036 ;
 
 * Drop non-Penn State institutions in Pennsylvania ;
 gen penn_state = (unitid >= 214000 & unitid <= 214999);
 sort instnm ;
 list unitid penn_state instnm ugds if st_fips==42 ;
 * Note: Millersville University of Pennsylvania is NOT a Penn State institution so change that ;
  replace penn_state = 0 if unitid == 214041;
 * Also note that some of the institutions are 2-year only so will not be in our data of BA grads;
 * Maybe I should get number of BA degrees granted too;
 drop if st_fips==42 & penn_state==0;
 * Also note that spending data for Penn State is combined and rolled up to the parent (Main campus), so will be set to missing;
 replace inexpfte = . if penn_state == 1;
 
  
 * Define and list top schools. We define topschool based on SAT math at a point that sweeps in CU Boulder;
 gen topschool2 = (sat_math >= 595 & sat_math != .);
 gen topschool1 = (sat_avg >= 1160 & sat_avg != .);
 * Set University of Utah as Top School. It has slightly higher SAT, higher grad rate, higher spending, lower admit rate even if sat is not above the threshold we set
 * Utah State has worse metrics (slightly). Weber state is open admissions, so has no SAT info ;
 replace topschool2 = 1 if unitid == 230764;
 replace topschool1 = 1 if unitid == 230764;
 
  * Now merge on migration data from LI;
 
 merge 1:1 unitid using "$ancillarydir\InstitutionFile.dta", keep(master match) keepusing(pginmsa pginstate pginstate_adj hhi distance_w avgmktwage inst_flagship pfinstate);
 
 
 
 gsort st_fips -sat_avg;
 list st_fips topschool1 topschool2 unitid opeid instnm  ugds sat_avg sat_math  c150_4 inexpfte adm_rate pginstate pginstate_adj pfinstate, string(35) sepby(st_fips);
 
* Drop Oklahoma;
drop if st_fips == 40;

preserve ;
collapse (count) count = unitid (mean) ugds sat_avg sat_math  c150_4  inexpfte adm_rate pginstate pginstate_adj pfinstate [aw = ugds], by(topschool1)  ;
 list topschool count ugds sat_avg sat_math  c150_4  inexpfte adm_rate pginstate pginstate_adj pfinstate ;
 export excel using "$graphdir\topschool_table.xlsx", sheet("combined", replace) firstrow(var) ;
 restore ;
 
 preserve ;
collapse (count) count = unitid (mean) ugds sat_avg sat_math  c150_4  inexpfte adm_rate pginstate pginstate_adj pfinstate [aw = ugds], by(topschool1 st_fips) ;
label val st_fips statel ;
sort st_fips topschool1 ;
list st_fips topschool1 count ugds sat_avg sat_math  c150_4  inexpfte adm_rate pginstate pginstate_adj pfinstate, sepby(st_fips) ;
 export excel using "$graphdir\topschool_table.xlsx", sheet("bystate", replace) firstrow(var) ;
 restore ;
 
 
 
log close;
